1 Objectives & Hypotheses

  1. Conduct an ancestry-stratified epigenome wide association study of prenatal maternal smoking and DNA methylation from child saliva samples at age nine and age fifteen. B) Examine differences in methylation trajectories from age nine to fifteen of the most significant methylation sites identified in the epigenome-wide association study between those exposed vs unexposed to prenatal smoking. C) Develop a saliva epigenetic classifier of prenatal smoke exposure using a machine learning approach and support vector machines and evaluate the agreement of the classifier at two time points.

Hypotheses: I hypothesize that different CpG loci will be associated with maternal prenatal smoking in African ancestry groups than in European and Hispanic ancestry groups. I hypothesize that the direction of associations between CpGs and prenatal maternal smoking will stay consistent between ages nine and fifteen but that the magnitude of associations will decline over time. Finally, I hypothesize that classification of prenatal maternal smoking status based upon DNA methylation information from salivary samples will be accurate and consistent at ages nine and fifteen.

2 Reading in data and preprocessing:

2.1 Labeling phenotype data

2.2 Cleaning pdqc data

2.2.1 Figure out analytic subsets 1) any methylation data; 2) 2-visits of methylation data

pdqc_all<-readRDS(paste0(datadir, 'OGData/', "pd_qc.rds"))
pdqc<-pdqc_all %>% filter(probe_fail_10==0 & sex==predicted_sex)
#What FF ids have methylation data?
methy_ids<-pdqc$id
#create slide variable fo pdqc 
pdqc$slide <- gsub('_.*', '', pdqc$MethID)
pdqc$array<-gsub('.*_', '', pdqc$MethID)
#ids with 9visit, 15visit or both visit
visit9ids<-pdqc%>%filter(childteen=='C')%>%pull(idnum)%>%unique()
visit15ids<-pdqc%>%filter(childteen=='T')%>%pull(idnum)%>%unique()
bothvisit_ids<-visit9ids[visit9ids %in% visit15ids]
visit9only_ids<-visit9ids[!(visit9ids %in%bothvisit_ids)]
visit15only_ids<-visit15ids[!(visit15ids %in%bothvisit_ids)]
#create flag in FF for having methylation data
#and flag for having 2 visits of methylation data
FF_labeled<-FF_labeled %>% 
  mutate(MethylData=ifelse(id %in% methy_ids, "Analysis subset", "Not in analysis subset"),
         TwoVisitData=ifelse(id%in%bothvisit_ids,"Analysis subset - both visits", "Not in both visit analysis subset"))
table(FF_labeled$MethylData)
## 
##        Analysis subset Not in analysis subset 
##                    880                   4018
table(FF_labeled$TwoVisitData)
## 
##     Analysis subset - both visits Not in both visit analysis subset 
##                               805                              4093
if((length(visit15only_ids)+length(visit9only_ids)+length(bothvisit_ids)!=length(FF_labeled %>% filter(MethylData=="Analysis subset")%>%pull(id)))){print("WARNING ID LENGTHS DO NOT ADD UP")}
if(length(bothvisit_ids)!=length(FF_labeled %>% filter(TwoVisitData=="Analysis subset - both visits")%>%pull(id))){print("WARNING ID LENGTHS DO NOT ADD UP")}

2.2.2 Filter technical replicates (only 1 technical replicate per person-visit + decide based on QC variables - pick the one with the lower probe fail percentage)

#technical replicates will be duplicate id + childteen variables from pdqc
pdqc<-pdqc %>% mutate(newID=paste(idnum, childteen, sep='_'))
techrep_ids<- pdqc %>% select(newID) %>% group_by(newID)%>%filter(n()>1)
techreps<-pdqc %>% filter(newID %in% techrep_ids$newID)%>%arrange(idnum,childteen)
hiPctPF<-techreps %>% group_by(newID) %>%filter(probe_fail_pct==max(probe_fail_pct))%>%pull(MethID)
pdqc_clean<-pdqc %>% filter(!(MethID %in% hiPctPF) & childteen!='M')
myMethIDs<-pdqc_clean %>% pull(MethID)

2.3 Methylation summary variables

2.3.1 Clocks

clocks<-read.table(file=paste0(datadir, "OGData/DNAm_clocks_ffcw.txt"), header = T)
colnames(clocks)[1]<-"MethID"

2.3.2 Global methylation score

Cell type proportions estimated using epidDISH package and the EpiFibIC reference (generic for epithelial tissues).

betaqc<-readRDS(file=paste0(datadir, 'OGData/', "noob_filtered.rds"))
aprioriCG<-c("cg05575921", "cg04180046", "cg05549655", "cg14179389")
globalmethy<-colMeans(betaqc, na.rm=T)
globalmethdf<-as.data.frame(cbind("MethID"=names(globalmethy), "globalmethylation"=globalmethy))
aprioriCGdf<-as.data.frame(t(betaqc[aprioriCG, ]))
aprioriCGdf$MethID = colnames(betaqc)
#cell type proportions 
celltypes<-epidish(beta.m = betaqc, ref.m = centEpiFibIC.m, method = "RPC")
estF_FF<-as.data.frame(celltypes$estF)
estF_FF$MethID = row.names(estF_FF)
#read in cpgs from PMC4833289
cpgs<-read.csv(paste0(datadir, 'PMC4833289_SuppFile2CpGs.csv'), stringsAsFactors = F, header=F, skip=2)
#make headers
header1<-scan(paste0(datadir, 'PMC4833289_SuppFile2CpGs.csv'), nlines=1, what=character(), sep=',')
header1[7:26]<-c(rep("SSnewborn", 5), rep("SSnewbornCT", 5), rep("SSolder", 5), rep("anynewborn", 5))
header2<-paste(header1, scan(paste0(datadir, 'PMC4833289_SuppFile2CpGs.csv'), nlines=1, skip=1, what=character(), sep=','), sep='_')
names(cpgs)<-header2
#four meta-EWASes: SS_newborn (sustained smoking in newborns); SS_newbornCT (sustained smoking in newborns controlling for cell type) SS_older (sustained smoking in older children); any_newborn (any smoking in newborns)
#pivot_longer to get single column of coefs and a column for the analysis type
#and group into a list by analysis type
cpgs<-cpgs %>% pivot_longer(cols = c(matches("_Coef|_SE|_P|_FDR|_Direction")), names_to = c("set", ".value"), names_pattern="(.+)_(.+)")%>%rename_with(~gsub('_', '', .x))%>%group_split(set)
cpgs<-cpgs%>%setNames(lapply(cpgs, function(x){x$set %>% unique()}))
#make scores - no transform, zscore and centermean
#first, pull script 
source(file.path(codedir, 'UsefulCode', 'makePolyEpiScores.R'))
scores=makePolyEpiScore(m=betaqc, b=cpgs)
## [1] "transforming the methylation matrix:none"
## [1] "transforming the methylation matrix:none"
## [1] "transforming the methylation matrix:none"
## [1] "transforming the methylation matrix:none"
scores_z=makePolyEpiScore(m=betaqc, b=cpgs, transformopt = 'zscore')
## [1] "transforming the methylation matrix:zscore"
## [1] "transforming the methylation matrix:zscore"
## [1] "transforming the methylation matrix:zscore"
## [1] "transforming the methylation matrix:zscore"
scores_center=makePolyEpiScore(m=betaqc, b=cpgs, transformopt = 'meancenter')
## [1] "transforming the methylation matrix:meancenter"
## [1] "transforming the methylation matrix:meancenter"
## [1] "transforming the methylation matrix:meancenter"
## [1] "transforming the methylation matrix:meancenter"
polyscores=list('notransform'=scores, 'zscore'=scores_z, 'center'=scores_center)
polyscores<-lapply(polyscores, function(x) x%>% mutate(MethID=colnames(betaqc)))
save(polyscores, file = file.path(datadir, 'CreatedData', 'polymethylationscores.RDS'))
polyscores_wide<-lapply(seq_along(polyscores), function(i) polyscores[[i]] %>% setNames(c(paste0(colnames(polyscores[[i]])[1:4], '_', names(polyscores)[i]), 'MethID')))%>%reduce(left_join, by='MethID')
clocks2<-clocks %>% mutate(idnum=gsub('a', '', idnum))%>%filter(MethID %in% pdqc$MethID) # currently clocks are missing 3 individuals who were in johns pipeline but not jonahs

methyldata<-left_join(left_join(left_join(left_join(left_join(pdqc_clean %>% select(MethID, slide, array,childteen, idnum), globalmethdf), clocks2), estF_FF), aprioriCGdf), polyscores_wide)
## Joining, by = "MethID"
## Joining, by = c("MethID", "idnum")
## Joining, by = "MethID"
## Joining, by = "MethID"
## Joining, by = "MethID"
methyldata[, c("globalmethylation", aprioriCG)]<-lapply(methyldata[, c("globalmethylation", aprioriCG)],function(x) as.numeric(as.character(x)))
#lapply(methyldata, class)
methyldata_wide <- methyldata %>% select(-MethID)%>% pivot_wider(., names_from=childteen, values_from=any_of(colnames(methyldata)[4:length(colnames(methyldata))]))
#weirdID<-methyldata %>% filter(is.na(globalmethylation))%>%pull(MethID)
#globalmethy[weirdID] #not actually missing global methylation data
#clocks %>% filter(MethID%in%weirdID) # #idnum from  not the same as in
#pdqc_clean %>% filter(MethID==weirdID) #this is just a typo in the idnum from data entry, see format of other #idnums
#pdqc_clean %>% pull(idnum) %>% head()
#solution: when creating methyldata, first trim alphabet from clocks$idnum that contain alphabetic characters --> implemented
#also misssing clocks for 3 sex outlier samples that weren't in jonahs' pipeline: 
table(is.na(methyldata$horvath13))
weirdIDs<-methyldata %>% filter(is.na(horvath13))%>%pull(MethID)
pdqc_all %>% filter(MethID %in% weirdID) %>% xtabs(~predicted_sex_outlier, data=.)

2.4 Creation of new variables

2.4.1 Binary smoking variable

FF_labeled<-FF_labeled %>%mutate(smkPreg_binary=ifelse(m1g4%in% c("1 2+pk/d","2 1<pk<2","3 <1pk/d" ), "Yes", ifelse(m1g4%in%c("-2 Don't know", "-3 Missing", "-9 Not in wave"), NA, ifelse(m1g4== "4 None", "No", ifelse(m1g4=="Missing", "Missing", "FLAG")))))
#xtabs(~m1g4+smkPreg_binary, data=FF_labeled)

2.4.2 Ancestry variables

Ancestry is based on csv files of ancestry-specific PCS created by Erin (per Jonah meeting Oct 6 2020). If an ID was in {african/european/hispanic}, then the individual is labeled as such.

# read in PCs
euroPC<-read.csv("/scratch/bfoxman_root/bfoxman/blostein/FragileFamilies/Files/OGData/pcs/european.csv")
afrPC<-read.csv("/scratch/bfoxman_root/bfoxman/blostein/FragileFamilies/Files/OGData/pcs/african.csv")
hisPC<-read.csv("/scratch/bfoxman_root/bfoxman/blostein/FragileFamilies/Files/OGData/pcs/hispanic.csv")
allPC<-read.table("/scratch/bfoxman_root/bfoxman/blostein/FragileFamilies/Files/OGData/pcs/global_pcs.txt", col.names=c("X", "idnum", paste("PC", 1:20, sep='')))
#individuals in each ancestry specific pc group can be labeled as that ancestry for categorical ancestry variable 
FF_labeled<-FF_labeled %>% mutate(ancestry=case_when(MethylData=="Analysis subset" & id %in% hisPC$idnum~"Hispanic ancestry", MethylData=="Analysis subset" & id %in%afrPC$idnum~"African ancestry", MethylData=="Analysis subset" & id %in% euroPC$idnum~"European ancestry", MethylData=="Analysis subset" & !(id %in%c(hisPC$idnum, afrPC$idnum, euroPC$idnum)) & (id %in% allPC$idnum)~"Other ancestry", MethylData=="Analysis subset" & !(id %in%c(hisPC$idnum, afrPC$idnum, euroPC$idnum)) & !(id %in% allPC$idnum)~"Missing PC data", MethylData!="Analysis subset"~"Not in analysis subset"))
#check ancestry variable
#table(FF_labeled$ancestry)
#with(FF_labeled, xtabs(~ancestry+cm1ethrace))
MissingPCdata<-FF_labeled %>% filter(ancestry=="Missing PC data")%>%select(id)
save(MissingPCdata, file = "/scratch/bfoxman_root/bfoxman/blostein/FragileFamilies/Files/CreatedData/MissingPC.Rdata")
#add PC data to FF_labeled 
FF_labeled<-left_join(FF_labeled, allPC %>% mutate(id=as.character(idnum)) %>% select(id, PC1, PC2)%>%rename_at(vars(-id), function(x) paste0('global_', x, sep='')))
## Joining, by = "id"
localPC<-rbind(euroPC, afrPC, hisPC)
FF_labeled<-left_join(FF_labeled, localPC  %>% mutate(id=as.character(idnum)) %>% select(id, PC1, PC2)%>% rename_at(vars(-id), function(x) paste0('local_', x, sep='')))
## Joining, by = "id"

There were 47 individuals without PC data. None of these individuals had data in the global PC file, so I assume they are actually missing genetic data (for whatever reason, possibly QC).

2.4.3 Prenatal exposure variables

Recoded frequency of prenatal alcohol drinking and drug use into Yes/no binary variables for drinking and drug use during pregnancy.

#table(FF_labeled$m1g2) # during preg, alchohol freq
#table(FF_labeled$m1g3) # during preg, drugs freq 
with(FF_labeled, table(m1g2, m1g3))
##           m1g3
## m1g2       1 E day 2 Svl/wk 3 Svl/mn 4 <1X/mn 5 Never Missing
##   1 E day        1        2        0        1       4       0
##   2 Svl/wk       1        7        2        2      15       0
##   3 Svl/mn       2        5       16       12      48       0
##   4 <1X/mn       6        1       14       58     324       0
##   5 Never       17       17       27       78    4219       6
##   Missing        0        0        2        0       6       5
FF_labeled<-FF_labeled %>% mutate_at(c("m1g2", "m1g3"), .funs=funs(YesNoPreg=ifelse(.=="5 Never", "No", ifelse(. =="Missing", "Missing", "Yes"))))
with(FF_labeled, table(m1g2, m1g2_YesNoPreg))
##           m1g2_YesNoPreg
## m1g2       Missing   No  Yes
##   1 E day        0    0    8
##   2 Svl/wk       0    0   27
##   3 Svl/mn       0    0   83
##   4 <1X/mn       0    0  403
##   5 Never        0 4364    0
##   Missing       13    0    0
with(FF_labeled, table(m1g3, m1g3_YesNoPreg))
##           m1g3_YesNoPreg
## m1g3       Missing   No  Yes
##   1 E day        0    0   27
##   2 Svl/wk       0    0   32
##   3 Svl/mn       0    0   61
##   4 <1X/mn       0    0  151
##   5 Never        0 4616    0
##   Missing       11    0    0
#table(FF_labeled$m1g5) # in last year, alc/drugs interfered with work/relationships
#table(FF_labeled$m1g6) # ever sought help for drug/alc problems

2.5 Postnatal smoke exposure

Maternal postnatal smoking at different survey points is fairly correlated, and alluvial plots of changes over time show broad consistency, so I decided to create a single smoking variable for maternal smoking at age 1 and age 5 - if mothers were smoking at either age 1 or age 5 then “Maternal smoking at age 1 or 5”, if not smoking at age 1 and age 5 then “No maternal smoking at age 1 and age 5” and then if missing data at one age and no at the other, then “Missing” (and if missing at both ages, “Missing”). Also created a similar dose variable capturing maternal packs per day that could be used instead. Decided to use just age 1 and age 5 because at these two time points the mother was asked (as opposed to PCG, also the dataset of Fragile Families metadata is missing PCG questions from age 3 about smoking) and they both preceed the actual saliva collection for both saliva collection age points. Then use a smoking dose variable from the mother or PCG interview at wave of saliva collection as an additional second hand smoke exposure variable.

#Baseline (1) -> Year 1/Age 1 (2) ->Year 3/Age3 (3) -> Year 5 (4) ->Year 9 (5) -> year 15 (6)
#Do you smoke in past month (how many packs per day)
#Mom            ->m2j5 (m2j5a)    ->                  -> m4j18 (m4j19) /p4 -> m5g17 (m5g18) / n5f17 (n5f18) 
#PCG                              p3a23a(p3a23b)                              n5f17 (n5f18)-> p6h76 (p6h77)
#seem to be missing p3 variables (PCG survey year 3) 
#number of people in household who smoke 
#p3a23->p5h15b->p6h78

#hours child around smoke 
#p3a21 -> p4a24->p5h15c

#with(FF_labeled, table(m2j5, m4j18))
#with(FF_labeled, table(m4j18, m5g17))
#with(FF_labeled, table(n5f17, m5g17))
#with(FF_labeled, table(p6h76, m5g17))


plot_grid(FF_labeled %>% select(m2j5, m4j18, m5g17, n5f17, p6h74) %>% mutate_all(funs(as.numeric(.))) %>%ggcorr(label=T)+ggtitle("Correlation between Yes/No smoking (mother) \n at years 1, 5, & 9 & PCG at years 9 & 15")
,FF_labeled %>% select(m2j5a, m4j19, m5g18, n5f18, p6h77) %>% mutate_all(funs(as.numeric(.))) %>%ggcorr(label=T)+ggtitle("Correlation between packs per day smoking (mother) \n at years 1, 5, 9 & PCG at years 9 and 15"), nrow=2)

yesnosmkalluv<-to_lodes_form(as.data.frame(FF_labeled %>% select(m2j5, m4j18, m5g17, p6h74) %>% mutate(m5g17=stringr::str_to_title(m5g17))%>% group_by(m2j5, m4j18, m5g17, p6h74) %>% count()), axes=1:4, id='Cohort')

#ggplot(yesnosmkalluv, aes(x=x, stratum = stratum, alluvium=Cohort, fill=stratum, label=stratum, y=n))+geom_flow(stat="alluvium", lode.guidance="backfront", color='darkgrey')+geom_stratum()+scale_x_discrete(labels=c("Age 1", "Age 5", "Age 9"))

smkdose<-to_lodes_form(as.data.frame(FF_labeled %>% select(m2j5a, m4j19, m5g18, p6h77)%>% mutate_at(vars(c("m2j5a", "m4j19", "m5g18")), funs(factor(substring(., 1, 1), labels=levels(FF_labeled$m2j5a)))) %>% group_by(m2j5a, m4j19, m5g18, p6h77) %>% count()), axes=1:4, id='Cohort')
#ggplot(smkdose, aes(x=x, stratum = stratum, alluvium=Cohort, fill=stratum, label=stratum, y=n))+geom_flow()+geom_stratum()+scale_x_discrete(labels=c("Age 1", "Age 5", "Age 9"))

plot_grid(ggplot(yesnosmkalluv, aes(x=x, stratum = stratum, alluvium=Cohort, fill=stratum, label=stratum, y=n))+geom_flow()+geom_stratum()+scale_x_discrete(labels=c("Age 1", "Age 5", "Age 9", "Age 15"))
+ggtitle("Alluvial plot of Yes/No smoking variables from age 1 to age 9 (mother) and age 15 (PCG)"), ggplot(smkdose %>% filter(!(stratum %in% c("-6 Skip", "Missing"))), aes(x=x, stratum = stratum, alluvium=Cohort, fill=stratum, label=stratum, y=n))+geom_flow()+geom_stratum()+scale_x_discrete(labels=c("Age 1", "Age 5", "Age 9", "Age 15"))+ggtitle("Alluvial plot of smoking per day among those smoking from age 1 to age 9 (mother) and 15 (PCG)"), nrow=2)

#ggplot(FF_labeled, aes(x=p5h15b, y=p6h78))+geom_jitter()+stat_cor()+scale_x_continuous('# smoking in house age 9')+scale_y_continuous('# smoking in house age 15')+ggtitle("# individuals smoking in house correlation age 9 and age 15")

#Make any maternal postnatal smoking variable & dose categorization for maternal smoking at ages 1 & 5 
FF_labeled<-FF_labeled %>% mutate(PostnatalMaternalSmokingAny = case_when(
                                m2j5 =="1 Yes" ~ "Maternal smoking at age 1 or age 5",
                                m4j18=="1 Yes" ~ "Maternal smoking at age 1 or age 5",
                                m2j5 == "Missing" & m4j18=="Missing" ~ "Missing",
                                m2j5 == "Missing" & m4j18=="2 No" ~ "Missing",
                                m2j5 == "2 No" & m4j18=="Missing" ~ "Missing",
                                m2j5 == "2 No" & m4j18=="2 No" ~ "No maternal smoking at age 1 and age 5")) %>%
  mutate(PostnatalMaternalSmokingDose=case_when(
                                m2j5a %in% c("2 1pk/d", "3 1.5pk/d", "4 2pk/d", "5 >2pk/d") &
                                m4j19 %in% c("2 About pack/day", "3 Pack and half/day", "4 2 packs/day", "5   
                                          More 2 packs/day") ~ "Mom - pack/d or greater at both age 1 & 5", 
                                m2j5a %in% c("2 1pk/d", "3 1.5pk/d", "4 2pk/d", "5 >2pk/d") &
                                !(m4j19 %in% c("2 About pack/day", "3 Pack and half/day", "4 2 packs/day", "5 
                                          More 2 packs/day")) ~ "Mom - pack/d or greater at either age 1 or 5",
                                !(m2j5a %in% c("2 1pk/d", "3 1.5pk/d", "4 2pk/d", "5 >2pk/d")) &
                                m4j19 %in% c("2 About pack/day", "3 Pack and half/day", "4 2 packs/day", "5 
                                          More 2 packs/day") ~ "Mom - pack/d or greater at either age 1 or 5",
                                m2j5a %in% c("1 <1/2 pk/d") | m4j19 %in% c("1 Less half pack/day")~"Mom - less than half pack/d at either age 1 or 5", 
                                m2j5a %in% c("-6 Skip") & m4j19 %in% c("-6 Skip")~
                                  "No maternal smoking at age 1 or 5", 
                                m2j5a %in% c("Missing") & m4j19 %in% c("-6 Skip", "Missing")~"Missing", 
                                m2j5a %in% c("Missing", "-6 Skip") & m4j19 %in% c("Missing")~"Missing"))

with(FF_labeled, table(PostnatalMaternalSmokingAny, m2j5, m4j18))
## , , m4j18 = 1 Yes
## 
##                                         m2j5
## PostnatalMaternalSmokingAny              1 Yes 2 No Missing
##   Maternal smoking at age 1 or age 5       868  305      81
##   Missing                                    0    0       0
##   No maternal smoking at age 1 and age 5     0    0       0
## 
## , , m4j18 = 2 No
## 
##                                         m2j5
## PostnatalMaternalSmokingAny              1 Yes 2 No Missing
##   Maternal smoking at age 1 or age 5       168    0       0
##   Missing                                    0    0     183
##   No maternal smoking at age 1 and age 5     0 2526       0
## 
## , , m4j18 = Missing
## 
##                                         m2j5
## PostnatalMaternalSmokingAny              1 Yes 2 No Missing
##   Maternal smoking at age 1 or age 5       123    0       0
##   Missing                                    0  371     273
##   No maternal smoking at age 1 and age 5     0    0       0
#with(FF_labeled, table(m2j5a, m4j19, PostnatalMaternalSmokingDose))
#with(FF_labeled, table(PostnatalMaternalSmokingAny, PostnatalMaternalSmokingDose))

Maybe here best to have 1 variable capturing any maternal postnatal smoking at age 1 & 5 and then use in each crossectional model the dose of maternal postnatal smoking at current visit (and similarly for linear mixed effect model)?

varLabels2<-c(paste(colnames(FF_labeled)[1:66], varLabels), "Has any methylation data?", "Has two visits of methylation data", "Any maternal smoking during pregnancy", "Ancestry from PCA of child's genetic data", "PC 1 (global)", "PC 2 (global", "PC 1 (within ancestry category)", "PC 2 (within ancestry category",
"Any alcohol during pregnancy", "Any drugs during pregnancy", "Any postnatal smoking (age 1 & 5)", "Postnatal smoking dose at age 1 & 5")
FF_labeled<-set_label(FF_labeled, varLabels2)
  
myFF<-FF_labeled %>% filter(MethylData=="Analysis subset")%>%mutate(idnum=id)
myFF<-left_join(myFF, methyldata)
## Joining, by = "idnum"
varLabels3<-c(varLabels2, "ID", "Methylation ID", "Slide", "Array (RC)", "Visit", "Global methylation", "horvath13 methylation clock", "horvath18 methylation clock", "pediatric methylation clock", "levine methylation clock", "Epithelial cell proportion", "Fibroid cell proportion", "Immune cell proportion", "cg05575291 methylation", "cg04180046 methylation",  "cg05549655 methylation", "cg14179389 methylation", "PES: Any smoking, newborn cordblood", "PES: Sustained smoking, newborn cordblood", "PES: Sustained smoking, newborn cordblood (+CT control)", "PES: Sustained smoking, older children peripheral blood", "PES: Any smoking, newborn cordblood - zscore", "PES: Sustained smoking, newborn cordblood - zscore", "PES: Sustained smoking, newborn cordblood (+CT control) - zscore", "PES: Sustained smoking, older children peripheral blood - zscore", "PES: Any smoking, newborn cordblood - mean center", "PES: Sustained smoking, newborn cordblood - mean center", "PES: Sustained smoking, newborn cordblood (+CT control) - mean center", "PES: Sustained smoking, older children peripheral blood - mean center")
myFF<-set_label(myFF, varLabels3)

2.5.1 Child smoking

Exclude children who reported smoking at age 9 and age 15. Interestingly, the children who report smoking at age 9 do not report smoking at age 15, and the parental perception of child smoking at age 9 does not match the children who say they have ever smoked a cigarrette. For now using the ‘ever smoked’ variable from age 15 for the exclusion, but there is also a ‘smoked in past month’ that we could use instead (lose fewer children).

with(myFF, table(k5f1l,childteen)) # 6 kids smoking at age 9 
##          childteen
## k5f1l       C   T
##   1 yes     6   6
##   2 no    816 835
##   Missing   7  15
with(myFF, table(p5q3cr, k5f1l)) # not the same kids whose parents say somewhat or very true that the child is using tobacco
##                               k5f1l
## p5q3cr                         1 yes 2 no Missing
##   1 not true                      12 1608      14
##   2 somewhat or sometimes true     0    4       0
##   3 very true or often true        0    2       0
##   Missing                          0   37       8
with(myFF, table(k6d40, childteen)) # 41 kids say ever smoked a cigarette at age 15
##          childteen
## k6d40       C   T
##   1 Yes    41  40
##   2 No    779 809
##   Missing   9   7
#with(myFF, summary(k6d41))
with(myFF, table(k6d42, childteen)) # of these only 13 say they've smoked in the past month 
##                         childteen
## k6d42                      C   T
##   -6 Skip                779 809
##   1 Never                 28  28
##   2 Once or twice a week  10  10
##   3 3 to 5 days a week     0   0
##   4 6 to 7 days a week     3   2
##   Missing                  9   7
#with(myFF, summary(k6d43))
with(myFF, table(k5f1l, k6d40, childteen))# not the same kids at age 9 and 15
## , , childteen = C
## 
##          k6d40
## k5f1l     1 Yes 2 No Missing
##   1 yes       1    5       0
##   2 no       40  769       7
##   Missing     0    5       2
## 
## , , childteen = T
## 
##          k6d40
## k5f1l     1 Yes 2 No Missing
##   1 yes       1    5       0
##   2 no       39  791       5
##   Missing     0   13       2

2.5.2 Maternal or PCG smoking dose at time of sample

myFF<-myFF %>% mutate(SmkAtVisitPastmonth=case_when(childteen=='C' & m5g17=='1 yes' & m5g18 %in% c("2 about a pack", "3 a pack and a half", "4 about 2 packs", 
                              "5 more than two  packs") ~ "Pack or more a day", 
                              childteen=='C' & m5g17=='1 yes' & m5g18=="1 less than half a pack a day" ~ "Less than pack a day", 
                              childteen=='C' & n5f17=="1 yes" & n5f18 =='2 about a pack' ~ "Pack or more a day",
                              childteen=='C' & n5f17=="1 yes" & n5f18 =="1 less than half a pack day" ~ "Less than pack a day",
                              childteen=='C' & m5g17=="1 yes" & m5g18 %in% c("-6 Skip", "Missing")~"Missing",
                              childteen == 'C' & n5f17%in%c("-7 N/A", "2 no") & m5g17=="2 no" ~ "No smoking", 
                              childteen == 'C' & n5f17%in%c("2 no") & m5g17%in%c("2 no", "Missing") ~ "No smoking", 
                              childteen == 'C' & n5f17 %in% c("Missing", "-7 N/A") & m5g17 =="Missing" ~ "Missing", 
                              childteen=='T' & p6h76 %in% c("-6 Skip", "0 None") ~ "No smoking", 
                              childteen=='T' & p6h77%in% c("1 5 or fewer cigarettes per day", "2 About half a pack a day, 10 cigarettes")~"Less than pack a day", 
                              childteen=='T' & p6h77 %in% c("3 About a pack a day, 20 cigarettes", "4 More than 1 pack a day")~"Pack or more a day",
                              childteen=='T' & p6h77 == "Missing"~"Missing"))%>%
              mutate(SmkAtVisitYesNo=case_when(SmkAtVisitPastmonth=="No smoking"~"No", 
                                               SmkAtVisitPastmonth %in% c("Less than pack a day", "Pack or more a day") ~ "Yes", 
                                               childteen=='T'& SmkAtVisitPastmonth=="Missing" & !(p6h76 %in% c("-6 Skip", "0 None", "Missing"))~"Yes",
                                               childteen=='C' & SmkAtVisitPastmonth=="Missing" & m5g17=='1 yes'~"Yes", 
                                               childteen=='C' & SmkAtVisitPastmonth=="Missing" & m5g17!='1 yes'~"Missing", 
                                               childteen=='T'& SmkAtVisitPastmonth=="Missing" & p6h76 %in% c("-6 Skip", "0 None", "Missing")~"Missing"))

myFF %>% filter(childteen=='C') %>% xtabs(~SmkAtVisitPastmonth+SmkAtVisitYesNo, data=.)
##                       SmkAtVisitYesNo
## SmkAtVisitPastmonth    Missing  No Yes
##   Less than pack a day       0   0 193
##   Missing                   13   0   0
##   No smoking                 0 544   0
##   Pack or more a day         0   0  78
myFF %>% filter(childteen=='T') %>% xtabs(~SmkAtVisitPastmonth+SmkAtVisitYesNo, data=.)
##                       SmkAtVisitYesNo
## SmkAtVisitPastmonth    Missing  No Yes
##   Less than pack a day       0   0 184
##   Missing                    3   0   2
##   No smoking                 0 628   0
##   Pack or more a day         0   0  39
myFF$SmkAtVisitPastmonth<-factor(myFF$SmkAtVisitPastmonth)

2.5.3 Child’s age at time of interview/visit

myFF %>% filter(is.na(cp6yagem) &is.na(hv6_yagem) & childteen=='T')%>%dim() #3
## [1]   3 112
myFF %>% filter(is.na(cm5b_age) &is.na(hv5_agem) &childteen=='C')%>%dim() #1 
## [1]   0 112
myFF %>% filter(!is.na(cm5b_age) & is.na(hv5_agem) & childteen=='C')%>%dim()#1
## [1]   1 112
myFF %>% filter(!is.na(cp6yagem) &is.na(hv6_yagem))%>%dim() #990
## [1] 976 112
myFF <- myFF %>% 
  mutate(ChildAgeAtIn=case_when(childteen=='C'~ cm5b_age, childteen=='T'~cp6yagem))%>%
  mutate(ChildAgeAtVisit=case_when(childteen=='C'~hv5_agem, childteen=='T'~hv6_yagem))%>% 
  mutate(ChildAgeComposite=case_when(childteen=='C' & !is.na(hv5_agem)~hv5_agem, 
         childteen=='C' & is.na(hv5_agem)~cm5b_age, 
         childteen=='T' & !is.na(hv6_yagem)~hv6_yagem, 
         childteen=='T' & is.na(hv6_yagem)~cp6yagem))

myFF %>% filter(childteen=='C')%>%group_by(ChildAgeAtVisit, ChildAgeComposite)%>%summarise(n=n())%>%filter(ChildAgeAtVisit!=ChildAgeComposite)
## `summarise()` regrouping output by 'ChildAgeAtVisit' (override with `.groups` argument)
## # A tibble: 0 x 3
## # Groups:   ChildAgeAtVisit [0]
## # … with 3 variables: ChildAgeAtVisit <dbl>, ChildAgeComposite <dbl>, n <int>
myFF %>% filter(childteen=='T')%>%group_by(ChildAgeAtVisit, ChildAgeComposite)%>%summarise(n=n())%>%filter(ChildAgeAtVisit!=ChildAgeComposite)
## `summarise()` regrouping output by 'ChildAgeAtVisit' (override with `.groups` argument)
## # A tibble: 0 x 3
## # Groups:   ChildAgeAtVisit [0]
## # … with 3 variables: ChildAgeAtVisit <dbl>, ChildAgeComposite <dbl>, n <int>
ggplot(myFF%>%filter(childteen=='C'), aes(cm5b_age, hv5_agem))+geom_point()+stat_cor()+ggtitle("Correlation between childs age at mom interview (9 yr) and childs age at home visit (9 yr)")
## Warning: Removed 23 rows containing non-finite values (stat_cor).
## Warning: Removed 23 rows containing missing values (geom_point).

age5resid<-myFF %>% filter(childteen=='C')%>%mutate(age5_resid=cm5b_age-hv5_agem)%>%pull(age5_resid)
hist(age5resid)

age6resid<-myFF %>% filter(childteen=='T')%>%mutate(age6_resid=cp6yagem-hv6_yagem)%>%pull(age6_resid)
ggplot(myFF%>%filter(childteen=='T'), aes(cp6yagem,hv6_yagem))+geom_point()+stat_cor()+ggtitle("Correlation between childs age at mom interview (15 yr) and childs age at home visit (15 yr)")
## Warning: Removed 498 rows containing non-finite values (stat_cor).
## Warning: Removed 498 rows containing missing values (geom_point).

hist(age6resid)

table(FF_labeled$cp6yagem-FF_labeled$hv6_yagem)
## 
##    0    5 
## 1088    1
table(FF_labeled$cm5b_age-FF_labeled$hv5_agem)
## 
##  -21  -18  -15  -14  -13  -12  -11  -10   -9   -8   -7   -6   -5   -4   -3   -2 
##    1    1    2    4    3    6   15   10    5   10   13   16   13   24   52   70 
##   -1    0    1    2    3    4    5    6    7    8   10   11   12   13   14   15 
##  123  189 1424 1222   18   12    5   15    4    4    3    2    1    1    1    3 
##   16   17   21 
##    1    1    1
table(is.na(myFF$ChildAgeComposite)) # 4
## 
## FALSE  TRUE 
##  1682     3
varLabels4<-c(varLabels3, "Mother/PCG (9/15 yr) smoking in month prior to interview (pk/day)", "Mother/PCG (9/15 yr) smoking in month prior to interview (yes/no)", "Child's age at maternal/PCG (9/15 yr) interview (months)", "Child's age at home vist (9/15 years) (months)", "Constructed child's age at home visit (in months) (9/15 years)")
myFF<-set_label(myFF, varLabels4)

I also found two variables for child’s age at home visit, but the variable for child’s age at home visit for 15 years has a large amount of missingness in it (missing n=498). The variable for child’s age at home visit for 9 year visit is better (missing n=9). The interview age variables correlate well with the home visit age variables. For the 9 year interview/home visit ages, there are 5 individuals with >12 month difference between the ages, for the 15 year visit, there are 0 (and actually, there is 0 difference for any of the individuals with nonmissing home visit ages). So one option would be to use the home visit ages for the 9 year visit and for individuals with home visit age data available for the 15 year visit, supplementing the 15 year visit age variable with the 15 year PCG interview age visit for those missing the home visit age. Alternatively we could use baby birthdate and saliva colleciton date to create an age variable. I have the baby DOB variable, but I don’t have/don’t know where a saliva sample collection date would be.

3 Analysis

3.1 Randomization of prenatal maternal smoking across slide numbers

slide_count<-myFF %>% mutate(slide=as.factor(slide))%>% group_by(smkPreg_binary, slide, .drop=F)%>%count()%>%pivot_wider(id_cols = slide, names_from=smkPreg_binary, values_from=n)%>%arrange(Yes)%>%mutate(Proportion=Yes/(No+Yes))
hist_data<-slide_count %>% ggplot(aes(x=Proportion))+geom_histogram(binwidth = 0.1, fill='blue', alpha=0.5, breaks=seq(0, 1, 0.1))+ggtitle('Histogram proprotion smk (from data)')
hist_data_c<-slide_count %>% ggplot(aes(x=Yes))+geom_histogram( fill='blue', alpha=0.5, breaks=c(0:12))+scale_x_continuous(breaks=c(0:12))+ggtitle('Histogram # Yes smk (from data)')

kable(slide_count)%>%kable_styling()%>%scroll_box(width='100%', height='400px')
slide No Yes Proportion
200347970068 12 0 0.0000000
200394970127 12 0 0.0000000
200397540068 12 0 0.0000000
200490750005 9 0 0.0000000
200490750009 11 0 0.0000000
200490750011 11 0 0.0000000
200490750063 8 0 0.0000000
200490750108 12 0 0.0000000
200490750124 12 0 0.0000000
200556930100 11 0 0.0000000
200556930111 10 0 0.0000000
200770460118 9 0 0.0000000
200770460172 12 0 0.0000000
200770460173 10 0 0.0000000
200863810009 12 0 0.0000000
200863810070 12 0 0.0000000
200863810114 10 0 0.0000000
200866140081 9 0 0.0000000
200866140163 12 0 0.0000000
201502620070 11 0 0.0000000
201502620071 11 0 0.0000000
201561870093 4 0 0.0000000
201561870123 12 0 0.0000000
3999984076 12 0 0.0000000
3999984094 12 0 0.0000000
3999984111 12 0 0.0000000
3999997019 12 0 0.0000000
9979858077 12 0 0.0000000
9979858078 12 0 0.0000000
9979858102 10 0 0.0000000
9979858118 12 0 0.0000000
9979858128 12 0 0.0000000
9986360014 12 0 0.0000000
9986360015 12 0 0.0000000
200397860021 10 1 0.0909091
200397860035 10 1 0.0909091
200490750003 10 1 0.0909091
200863810112 7 1 0.1250000
201502620017 10 1 0.0909091
201561870100 7 1 0.1250000
3999932043 9 1 0.1000000
9829118090 9 1 0.1000000
9829118105 9 1 0.1000000
9986360019 11 1 0.0833333
200347970051 10 2 0.1666667
200347970066 10 2 0.1666667
200347970071 10 2 0.1666667
200394970128 10 2 0.1666667
200397540074 10 2 0.1666667
200397860024 10 2 0.1666667
200397860046 7 2 0.2222222
200397860066 9 2 0.1818182
200397870011 8 2 0.2000000
200400320071 10 2 0.1666667
200490750001 10 2 0.1666667
200490750006 7 2 0.2222222
200490750021 10 2 0.1666667
200490750053 9 2 0.1818182
200490750074 8 2 0.2000000
200490750079 5 2 0.2857143
200490750081 10 2 0.1666667
200490750106 7 2 0.2222222
200490750107 10 2 0.1666667
200490750148 10 2 0.1666667
200556930110 10 2 0.1666667
200556930121 9 2 0.1818182
200556930130 9 2 0.1818182
200770460063 7 2 0.2222222
200863810019 10 2 0.1666667
200863810029 10 2 0.1666667
200863810053 9 2 0.1818182
200866140164 10 2 0.1666667
200866140165 10 2 0.1666667
200866140214 10 2 0.1666667
201502620002 9 2 0.1818182
201502620003 9 2 0.1818182
201502620010 7 2 0.2222222
201502620024 8 2 0.2000000
201502620039 8 2 0.2000000
201502620073 10 2 0.1666667
201502620074 9 2 0.1818182
201502620078 10 2 0.1666667
201561870001 10 2 0.1666667
201561870002 10 2 0.1666667
201561870060 8 2 0.2000000
201561870068 8 2 0.2000000
201561870083 10 2 0.1666667
201561870090 10 2 0.1666667
201561870091 10 2 0.1666667
201561870092 10 2 0.1666667
201561870101 8 2 0.2000000
201561870126 6 2 0.2500000
201561870136 9 2 0.1818182
3999932048 9 2 0.1818182
3999932050 10 2 0.1666667
3999932051 10 2 0.1666667
3999984051 10 2 0.1666667
3999984077 10 2 0.1666667
3999984080 7 2 0.2222222
3999984088 9 2 0.1818182
3999984090 10 2 0.1666667
3999984108 10 2 0.1666667
9829118115 5 2 0.2857143
9829118137 9 2 0.1818182
9979858076 10 2 0.1666667
9979858105 10 2 0.1666667
9986360012 10 2 0.1666667
9986360033 9 2 0.1818182
200556930073 9 3 0.2500000
200556930095 9 3 0.2500000
200770460062 8 3 0.2727273
200863730056 9 3 0.2500000
201502620016 7 3 0.3000000
3999932006 5 3 0.3750000
3999932091 9 3 0.2500000
3999932092 6 3 0.3333333
200347970040 8 4 0.3333333
200347970050 6 4 0.4000000
200347970069 8 4 0.3333333
200397540092 8 4 0.3333333
200397860096 6 4 0.4000000
200490750077 8 4 0.3333333
200490750082 8 4 0.3333333
200490750090 8 4 0.3333333
200490750091 3 4 0.5714286
200490750093 8 4 0.3333333
200556930109 6 4 0.4000000
200863730043 8 4 0.3333333
200863730057 8 4 0.3333333
200866140028 6 4 0.4000000
200866140079 8 4 0.3333333
200866140238 8 4 0.3333333
3999984091 8 4 0.3333333
3999984112 8 4 0.3333333
3999997140 8 4 0.3333333
9979858131 8 4 0.3333333
9986360027 8 4 0.3333333
200490750095 7 5 0.4166667
201561870059 7 5 0.4166667
3999932036 6 5 0.4545455
200347970036 6 6 0.5000000
200397540069 6 6 0.5000000
200397860094 6 6 0.5000000
200490750080 2 6 0.7500000
200866140159 6 6 0.5000000
3999984067 6 6 0.5000000
9986360025 6 6 0.5000000
9986360046 6 6 0.5000000
200397540040 4 8 0.6666667
201561870003 4 8 0.6666667
3999984100 4 8 0.6666667
3999984114 4 8 0.6666667
#simulation
set.seed(seed=20210216)
n<-nrow(myFF)
yes<-table(myFF$smkPreg_binary)[2]; no<-table(myFF$smkPreg_binary)[1]
values<-data.frame('smk'=sample(c('No', 'Yes'), n, replace=T, prob=c(no/(no+yes), yes/(no+yes))))
values_slides<-values %>% 
    group_by((row_number()-1) %/% (n()/152)) %>%
    nest()%>%unnest(cols=c(data))
colnames(values_slides)<-c('slide', 'smk')
slide_sim<-values_slides%>%mutate(slide=as.factor(slide)) %>% group_by(smk, slide, .drop=F)%>%count()%>%pivot_wider(id_cols = slide, names_from=smk, values_from=n)%>%arrange(Yes)%>%mutate(Proportion=Yes/(No+Yes))
hist_sim<-slide_sim%>% ggplot(aes(x=Proportion))+geom_histogram(binwidth = 0.1, fill='red', alpha=0.5, breaks=seq(0, 1, 0.1))+ggtitle('Histogram proprotion smk (from simulation)')
hist_sim_c<-slide_sim%>% ggplot(aes(x=Yes))+geom_histogram(fill='red', alpha=0.5, breaks=c(0:12))+scale_x_continuous(breaks=c(0:12))+ggtitle('Histogram # Yes smk (from simulation)')


hist_overlay<-slide_count %>% ggplot(aes(x=Proportion))+geom_histogram(fill='blue', binwidth = 0.1, alpha=0.5, breaks=seq(0, 1, 0.1))+geom_histogram(aes(x=slide_sim$Proportion), breaks=seq(0, 1, 0.1), fill='red', binwidth = 0.1, alpha=0.5)+xlim(0,1)
hist_overlay_c<-slide_count %>% ggplot(aes(x=Yes))+geom_histogram(fill='blue', alpha=0.5, breaks=c(0:12))+geom_histogram(aes(x=slide_sim$Yes), fill='red',  alpha=0.5, breaks=c(0:12))+scale_x_continuous(breaks=c(0:12))



plot_grid(plot_grid(hist_data, hist_sim, nrow=2), hist_overlay, nrow=1)

plot_grid(plot_grid(hist_data_c, hist_sim_c, nrow=2), hist_overlay_c, nrow=1)

3.2 Comparison of analytic sample - methylation data to Fragile Families sample

FF_labeled<-set_label(FF_labeled, varLabels)
methyl<-createTable(compareGroups(MethylData~smkPreg_binary+cm1inpov+cm1ethrace+cm1bsex+m1g2_YesNoPreg+m1g3_YesNoPreg+PostnatalMaternalSmokingAny+k5f1l+k6d40+f1g4, data=FF_labeled), show.all=T)
export2md(methyl, caption="Exclusion table of all Fragile Families vs those with any methylation data for main predictors and key demographic variables from first wave")
Exclusion table of all Fragile Families vs those with any methylation data for main predictors and key demographic variables from first wave
[ALL] Analysis subset Not in analysis subset p.overall
N=4898 N=880 N=4018
Any maternal smoking during pregnancy: 0.230
Missing 12 (0.24%) 0 (0.00%) 12 (0.30%)
No 3934 (80.3%) 702 (79.8%) 3232 (80.4%)
Yes 952 (19.4%) 178 (20.2%) 774 (19.3%)
cm1inpov Constructed - Poverty ratio - mother’s household income/poverty threshold 2.22 (2.41) 2.23 (2.43) 2.22 (2.41) 0.981
cm1ethrace Mother race (baseline own report): .
1 white, non-hispanic 1030 (21.0%) 174 (19.8%) 856 (21.3%)
2 black, non-hispanic 2326 (47.5%) 485 (55.1%) 1841 (45.8%)
3 hispanic 1336 (27.3%) 188 (21.4%) 1148 (28.6%)
4 other 194 (3.96%) 31 (3.52%) 163 (4.06%)
Missing 12 (0.24%) 2 (0.23%) 10 (0.25%)
cm1bsex Constructed - Focal baby’s gender: 0.348
1 Boy 2568 (52.4%) 444 (50.5%) 2124 (52.9%)
2 Girl 2329 (47.6%) 436 (49.5%) 1893 (47.1%)
Missing 1 (0.02%) 0 (0.00%) 1 (0.02%)
Any alcohol during pregnancy: 0.635
Missing 13 (0.27%) 2 (0.23%) 11 (0.27%)
No 4364 (89.1%) 777 (88.3%) 3587 (89.3%)
Yes 521 (10.6%) 101 (11.5%) 420 (10.5%)
Any drugs during pregnancy: 0.478
Missing 11 (0.22%) 3 (0.34%) 8 (0.20%)
No 4616 (94.2%) 833 (94.7%) 3783 (94.2%)
Yes 271 (5.53%) 44 (5.00%) 227 (5.65%)
Any postnatal smoking (age 1 & 5): <0.001
Maternal smoking at age 1 or age 5 1545 (31.5%) 333 (37.8%) 1212 (30.2%)
Missing 827 (16.9%) 47 (5.34%) 780 (19.4%)
No maternal smoking at age 1 and age 5 2526 (51.6%) 500 (56.8%) 2026 (50.4%)
k5f1l F1L. Smoked a cigarette or used tobacco: <0.001
1 yes 26 (0.53%) 6 (0.68%) 20 (0.50%)
2 no 3313 (67.6%) 859 (97.6%) 2454 (61.1%)
Missing 1559 (31.8%) 15 (1.70%) 1544 (38.4%)
k6d40 ID: <0.001
1 Yes 185 (3.78%) 41 (4.66%) 144 (3.58%)
2 No 3244 (66.2%) 829 (94.2%) 2415 (60.1%)
Missing 1469 (30.0%) 10 (1.14%) 1459 (36.3%)
f1g4 In the past 3 months, how many cigarettes did you smoke a day?: 0.029
1 2+pk/d 62 (1.27%) 11 (1.25%) 51 (1.27%)
2 1<pk<2 364 (7.43%) 73 (8.30%) 291 (7.24%)
3 <1pk/d 1067 (21.8%) 204 (23.2%) 863 (21.5%)
4 None 2327 (47.5%) 434 (49.3%) 1893 (47.1%)
Missing 1078 (22.0%) 158 (18.0%) 920 (22.9%)

3.3 Comparison of variables by ancestry

3.4 Flow chart of sample exclusions

methylCohort<-pdqc_all %>% filter(childteen!='M')

basemodelvar<-c("smkPreg_binary", "ancestry", "cm1bsex", "cm1inpov", "Epi", "IC", "ChildAgeComposite")
basemodeldata<-myFF %>% filter_at(all_of(basemodelvar), all_vars(!(. %in%c("Missing", "Missing PC data"))& !is.na(.)))
child_base<-basemodeldata%>%filter(childteen=='C')
teen_base<-basemodeldata%>%filter(childteen=='T')

prenatalexposurevar<-c(basemodelvar, "m1g2_YesNoPreg", "m1g3_YesNoPreg")
prenatalexdata<-basemodeldata %>%filter_at(all_of(prenatalexposurevar), all_vars(!(. %in% c("Missing"))))
child_prenatal<-prenatalexdata%>%filter(childteen=='C')
teen_prenatal<-prenatalexdata%>%filter(childteen=='T')

secondhandsmokevars<-c(prenatalexposurevar, "PostnatalMaternalSmokingAny", "SmkAtVisitPastmonth")
secondhandsmkdata<-prenatalexdata%>%filter_at(all_of(secondhandsmokevars), all_vars(!(. %in% c("Missing"))))
child_secondhand<-secondhandsmkdata %>% filter(childteen=='C')
teen_secondhand<-secondhandsmkdata%>%filter(childteen=='T')
twovisit_secondhand<-secondhandsmkdata %>% filter(idnum %in% child_secondhand$idnum & idnum %in% teen_secondhand$idnum)

child_smoke<-secondhandsmkdata %>% filter(k5f1l!= "2 no" & childteen=='C')
child_nosmoke<-secondhandsmkdata %>% filter(k5f1l=="2 no" & childteen=='C')
teen_smoke<-secondhandsmkdata %>% filter(k6d40!="2 No" & childteen=='T')
teen_nosmoke<-secondhandsmkdata %>% filter(k6d40=="2 No" & childteen=='T')  
twovisit_nosmoke<-secondhandsmkdata %>% filter(idnum %in% child_nosmoke$idnum & idnum %in% teen_nosmoke$idnum)

fathersmkdata<-secondhandsmkdata %>% filter(f1g4!="Missing")

#Make Labels
n_fullFF<-paste0("Fragile Families cohort \n", "n=", FF_labeled %>% nrow())
n_methylQC<-paste0("Cohort with 450K methylation data \n", "n=", methylCohort$idnum%>%unique()%>%length(), "individuals with m=", methylCohort$MethID%>%length(), "samples")

n_QCloss<-paste0("Methylation data QC: 12 outlier samples dropped, \n", pdqc_all%>%filter(probe_fail_10==T)%>%nrow()-12, " samples with >10% of probes failing dropped, \n", pdqc_all%>%filter(sex!=predicted_sex & probe_fail_10==F & childteen!='M')%>%nrow(), " discordant sex samples dropped, \n & ", nrow(pdqc)-nrow(pdqc_clean), " duplicate samples dropped")
n_methylFF<-paste0("Cohort with quality controlled 450K data \n","n=", myFF %>% select(id) %>% unique() %>% nrow(), " individuals with m=", myFF%>%nrow(), " samples")

n_basemodel<-paste0("Base models: \n", "n=", basemodeldata %>% select(id) %>% unique() %>% nrow(), " individuals with m=", basemodeldata %>%nrow(), " samples")
n_baseexclusions<-paste0("n=",  myFF %>% filter_at(all_of(basemodelvar), any_vars((. %in%c("Missing", "Missing PC data"))))%>% select(id) %>% unique() %>% nrow(), " individuals & m=", myFF %>% filter_at(all_of(basemodelvar), any_vars((. %in%c("Missing", "Missing PC data")))) %>% nrow(), " samples missing principal components of ancestry; \n n=", myFF %>% filter_at(all_of(basemodelvar), any_vars((is.na(.))))%>%select(id)%>%unique()%>%nrow(), " individuals & m=", myFF %>% filter_at(all_of(basemodelvar), any_vars((is.na(.))))%>%nrow(), " samples missing child age at maternal/PCG interview")

n_prenatalexposures<-paste0("Prenatal exposures models: \n","n=", prenatalexdata %>% select(id) %>% unique() %>% nrow(), " individuals with m=", prenatalexdata %>%nrow(), " samples")
n_prenatalexclusions<-paste0("n=", basemodeldata %>% filter_at(all_of(prenatalexposurevar), any_vars(. %in% c("Missing"))) %>% select(id) %>% unique() %>% nrow(), " individuals & m=", basemodeldata %>% filter_at(all_of(prenatalexposurevar), any_vars(. %in% c("Missing"))) %>% nrow(), " samples missing \n data on maternal prenatal alcohol and drug use")

n_secondhandSmk<-paste0("Secondhand smoke exposure models: \n","n=", secondhandsmkdata %>% select(id) %>% unique() %>% nrow(), " individuals with m=", secondhandsmkdata %>%nrow(), " samples")
n_secondhandexclusions<-paste0("n=", prenatalexdata$id[!(prenatalexdata$id %in% secondhandsmkdata$id)] %>% unique()%>% length(), " individuals & m=", prenatalexdata %>% filter_at(all_of(secondhandsmokevars), any_vars(. %in% c("Missing"))) %>% nrow(), " samples missing data \n on maternal or primary care giver postnatal smoking")

n_Child<-paste0('n=', child_secondhand  %>% nrow(), " individuals with a 9 year visit")
n_ChildSmkexcluded<-paste0("Sensitivity analysis exclude children who smoke: \n",'n=', child_nosmoke %>% nrow(), " individuals with a 9 year visit")
n_ChildSmkexclusions<-paste0('n=', child_smoke%>%nrow(), " individual \n smoking at 9 year visit \n or missing focal child smoking data")
n_Teen<-paste0('n=', teen_secondhand  %>% nrow(), " individuals with a 15 year visit")
n_TeenSmkexcluded<-paste0("Sensitivity analysis exclude children who smoke: \n",'n=', teen_nosmoke %>% nrow(), " individuals with a 15 year visit")
n_TeenSmkexclusions<-paste0('n=', teen_smoke%>%nrow(), " individuals \n smoking at 15 year visit \n or missing focal child smoking data") 

n_sensFatherSmoking<-paste0("Sensitivity analysis: compare effects of maternal and paternal prenatal smoking \n", "n=", fathersmkdata %>% select(id) %>% unique() %>% nrow(), " individuals with m=", fathersmkdata %>%nrow(), " samples", "\n with prenatal paternal smoking data")
n_sensFatherSmokingEx<- paste0("n=", secondhandsmkdata$id[!(secondhandsmkdata$id %in% fathersmkdata$id)] %>% unique()%>% length(), " individuals & m=", secondhandsmkdata %>% filter(f1g4 %in% c("Missing")) %>% nrow(), " samples missing data on paternal prenatal smoking")

n_twovisits<-paste0("n=", twovisit_secondhand%>%pull(idnum) %>%unique() %>%length(), " individuals with both a 9 and 15 year visit")
n_twovisitsSmkExcluded<-paste0("Sensitivity analysis exclude children who smoke: \n", "n=", twovisit_nosmoke%>%pull(idnum) %>%unique() %>%length(), " individuals with both a 9 and 15 year visit")
n_twovisitSmkExclusions<-paste0("n=", (twovisit_secondhand%>%pull(idnum) %>%unique() %>%length())-(twovisit_nosmoke%>%pull(idnum) %>%unique() %>%length()), " individuals smoking at \n either 9 and 15 year visit or \n missing focal child smoking data")
graph1<-DiagrammeR::grViz("
   digraph graph2 {
   compound=true;
   graph [layout = dot]
   
   node [shape = rectangle, width=4]
   a [label = '@@1']
   b [label = '@@2']
   c [label = '@@3']
   d [label = '@@4']
   e [label = '@@5']
   f [label = '@@6']
   g [label = '@@7']
   h [label = '@@8']
   i [label = '@@9', group=g1]
   j [label = '@@10']
   k [label = '@@16']
   l [label = '@@17']
   m [label = '@@19']
   
   #fake nodes for connecting to edges
   node [shape=point, width=0.01, height=0.01]
   a1
   b1
   c1
   d1
#   e1
  f1 [group=g1]
  g1
  k1
   
  #exclusion nodes
  node [shape = rectangle, width=4]
  aa [label = '@@20']
  bb [label = '@@11']
  cc [label = '@@12']
  dd [label = '@@13']
  ff [label = '@@14', width=3]
  gg [label = '@@15', width=3]
  kk [label = '@@18', width=3]
    
  #draw connections
  a->m
  m->a1 [arrowhead=none]
  a1->b
  b->b1 [arrowhead=none]
  b1->c
  c->c1 [arrowhead=none]
  c1->d
  d->d1 [arrowhead=none]
  d1->e
  e->f; e->g; e->k
{rank=same; h->e [dir=back, minlen=2, style=dashed]}
  f->f1 [arrowhead=none, style=dashed]
  g->g1 [arrowhead=none, style=dashed]
  k->k1 [arrowhead=none, style=dashed]
  f1->i [style=dashed]
  g1->j [style=dashed]
  k1->l [style=dashed]
  
{rank=same; a1->aa [minlen=2]}   
{rank=same; b1->bb [minlen=2]} 
{rank=same; c1->cc [minlen=2]}
{rank=same; d1->dd [minlen=2]}
{rank=same; f1->ff [minlen=2]}
{rank=same; g1->gg [minlen=2]}
{rank=same; k1->kk [minlen=2]}

#invisible constraints to force ordering
ff->i [style=invis]
   }   
   
   [1]: n_fullFF
   [2]: n_methylFF
   [3]: n_basemodel
   [4]: n_prenatalexposures
   [5]: n_secondhandSmk
   [6]: n_Child
   [7]: n_Teen
   [8]: n_sensFatherSmoking
   [9]: n_ChildSmkexcluded
   [10]: n_TeenSmkexcluded
   [11]: n_baseexclusions
   [12]: n_prenatalexclusions
   [13]: n_secondhandexclusions
   [14]: n_ChildSmkexclusions
   [15]: n_TeenSmkexclusions
   [16]: n_twovisits
   [17]: n_twovisitsSmkExcluded
   [18]: n_twovisitSmkExclusions
   [19]: n_methylQC
   [20]: n_QCloss
")
graph1

3.5 Cell type proportions over time

Cell type proportions do differ between 9 and 15 year visits and although global methylation does not look different between these visits, methylation clocks do (a good check), although, interestingly, cell type proportion does correlate with methylation age.

3.6 Distributions & checks of methylation variables

3.6.1 Correlation between all methylation data

methylation_data<-myFF %>% select(colnames(methyldata))%>%select(-MethID, -childteen, -idnum, -slide, -array)
chart.Correlation(methylation_data, histogram=T)

3.6.2 Correlation between polymethylation scores

chart.Correlation(methylation_data %>%select(SSolder_notransform, SSnewbornCT_notransform, SSnewborn_notransform, anynewborn_notransform))

Thought this was a little weird, investigated the actual coefficients for each score:

coefs_methyl<-cbind('SSolderCoefs'=cpgs$SSolder$Coef, 'SSnewbornCTCoefs'=cpgs$SSnewbornCT$Coef, 'SSnewbornCoefs'=cpgs$SSnewborn$Coef, 'anynewbornCoefs'=cpgs$anynewborn$Coef)
chart.Correlation(coefs_methyl)

PMS<-methyldata %>%select(idnum, childteen, colnames(polyscores_wide), -MethID)%>%pivot_longer(cols=any_of(colnames(polyscores_wide)), names_to="PMS_type", values_to="PolymethylationScore")%>%pivot_wider(names_from = childteen, values_from=PolymethylationScore)
ggplot(PMS, aes(x=C, y=T))+geom_point()+facet_grid(~PMS_type)+stat_cor()+geom_smooth(method=lm)
gg_newborn<-ggplot(methyldata, aes(x=childteen, y=SSnewbornCT_center, group=idnum))+geom_point()+geom_line(alpha=0.2)+geom_smooth(aes(group=1), method='lm')+ggtitle("SS newborn (+CT)")
gg_older<-ggplot(methyldata, aes(x=childteen, y=SSolder_center, group=idnum))+geom_point()+geom_line(alpha=0.2)+geom_smooth(aes(group=1), method='lm')+ggtitle("SS older")
plot_grid(gg_newborn, gg_older, nrow = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

gg_newborn<-ggplot(myFF, aes(x=childteen, y=SSnewbornCT_center, group=idnum, color=smkPreg_binary))+geom_point()+geom_line(alpha=0.2)+geom_smooth(aes(group=1),color='black', method='lm')+facet_wrap(~smkPreg_binary)
gg_older<-ggplot(myFF, aes(x=childteen, y=SSolder_center, group=idnum, color=smkPreg_binary))+geom_point()+geom_line(alpha=0.2)+geom_smooth(aes(group=1),color='black', method='lm')+facet_wrap(~smkPreg_binary)
plot_grid(gg_newborn, gg_older, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

PMS<-left_join(methyldata %>%select(idnum, childteen, colnames(polyscores_wide), -MethID)%>%pivot_longer(cols=any_of(colnames(polyscores_wide)), names_to="PMS_type", values_to="PolymethylationScore"), myFF[, c('idnum', 'smkPreg_binary')]%>%unique())
## Joining, by = "idnum"
ggplot(PMS, aes(x=childteen, y=PolymethylationScore, color=smkPreg_binary))+geom_boxplot()+facet_wrap(~PMS_type)+stat_compare_means(label='p.signif', label.y=4)

3.7 Distribution of main predictor and bivariate tables (stratified by ancestry)

3.7.1 Covariates by main predictor (unstratified)

tabledata<-rbind(child_nosmoke, teen_nosmoke)
tabledata<-set_label(tabledata, varLabels4)
smk_all<-strataTable(createTable(compareGroups(smkPreg_binary~globalmethylation+SSnewbornCT_notransform+SSolder_notransform+cg05575921+pediatric+levine+Epi+Fib+IC+cm1inpov+cm1ethrace+cm1bsex+m1g2_YesNoPreg+m1g3_YesNoPreg+PostnatalMaternalSmokingAny+SmkAtVisitPastmonth+k5f1l+k6d40+f1g4, data=tabledata, simplify = F), show.all=F), 'childteen')
export2md(smk_all)
Summary descriptive tables

C
T
No Yes p.overall No Yes p.overall
N=574 N=149 N=583 N=137
Global methylation 0.48 (0.01) 0.48 (0.01) 0.377 0.48 (0.01) 0.48 (0.01) 0.145
PES: Sustained smoking, newborn cordblood (+CT control) 2.58 (0.22) 2.72 (0.23) <0.001 2.62 (0.22) 2.74 (0.23) <0.001
PES: Sustained smoking, older children peripheral blood 2.35 (0.24) 2.46 (0.25) <0.001 2.35 (0.28) 2.45 (0.28) <0.001
cg05575291 methylation 0.78 (0.05) 0.77 (0.06) 0.022 0.77 (0.06) 0.76 (0.06) 0.323
pediatric methylation clock 8.98 (0.85) 9.02 (1.23) 0.752 11.6 (1.74) 11.6 (1.79) 0.865
levine methylation clock 6.44 (5.79) 6.82 (6.33) 0.500 16.4 (6.21) 16.4 (6.13) 0.981
Epithelial cell proportion 0.27 (0.13) 0.26 (0.12) 0.199 0.30 (0.16) 0.30 (0.16) 0.724
Fibroid cell proportion 0.02 (0.01) 0.02 (0.01) 0.946 0.01 (0.01) 0.01 (0.01) 0.704
Immune cell proportion 0.71 (0.12) 0.73 (0.12) 0.182 0.68 (0.15) 0.69 (0.15) 0.697
cm1inpov Constructed - Poverty ratio - mother’s household income/poverty threshold 2.41 (2.56) 1.45 (1.48) <0.001 2.41 (2.57) 1.37 (1.44) <0.001
cm1ethrace Mother race (baseline own report): <0.001 0.004
1 white, non-hispanic 98 (17.1%) 46 (30.9%) 101 (17.3%) 41 (29.9%)
2 black, non-hispanic 324 (56.4%) 82 (55.0%) 333 (57.1%) 76 (55.5%)
3 hispanic 131 (22.8%) 17 (11.4%) 127 (21.8%) 16 (11.7%)
4 other 19 (3.31%) 4 (2.68%) 20 (3.43%) 4 (2.92%)
Missing 2 (0.35%) 0 (0.00%) 2 (0.34%) 0 (0.00%)
cm1bsex Constructed - Focal baby’s gender: 0.581 0.704
1 Boy 286 (49.8%) 70 (47.0%) 280 (48.0%) 63 (46.0%)
2 Girl 288 (50.2%) 79 (53.0%) 303 (52.0%) 74 (54.0%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Any alcohol during pregnancy: <0.001 <0.001
No 531 (92.5%) 102 (68.5%) 543 (93.1%) 95 (69.3%)
Yes 43 (7.49%) 47 (31.5%) 40 (6.86%) 42 (30.7%)
Any drugs during pregnancy: <0.001 <0.001
No 564 (98.3%) 123 (82.6%) 574 (98.5%) 109 (79.6%)
Yes 10 (1.74%) 26 (17.4%) 9 (1.54%) 28 (20.4%)
Any postnatal smoking (age 1 & 5): <0.001 <0.001
Maternal smoking at age 1 or age 5 146 (25.4%) 142 (95.3%) 151 (25.9%) 130 (94.9%)
No maternal smoking at age 1 and age 5 428 (74.6%) 7 (4.70%) 432 (74.1%) 7 (5.11%)
Mother/PCG (9/15 yr) smoking in month prior to interview (pk/day): <0.001 <0.001
Less than pack a day 91 (15.9%) 82 (55.0%) 82 (14.1%) 75 (54.7%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
No smoking 458 (79.8%) 23 (15.4%) 490 (84.0%) 43 (31.4%)
Pack or more a day 25 (4.36%) 44 (29.5%) 11 (1.89%) 19 (13.9%)
k5f1l F1L. Smoked a cigarette or used tobacco: . 0.839
1 yes 0 (0.00%) 0 (0.00%) 3 (0.51%) 0 (0.00%)
2 no 574 (100%) 149 (100%) 571 (97.9%) 136 (99.3%)
Missing 0 (0.00%) 0 (0.00%) 9 (1.54%) 1 (0.73%)
k6d40 ID: 0.003 .
1 Yes 17 (2.96%) 14 (9.40%) 0 (0.00%) 0 (0.00%)
2 No 552 (96.2%) 133 (89.3%) 583 (100%) 137 (100%)
Missing 5 (0.87%) 2 (1.34%) 0 (0.00%) 0 (0.00%)
f1g4 In the past 3 months, how many cigarettes did you smoke a day?: <0.001 <0.001
1 2+pk/d 8 (1.39%) 3 (2.01%) 7 (1.20%) 3 (2.19%)
2 1<pk<2 42 (7.32%) 20 (13.4%) 41 (7.03%) 20 (14.6%)
3 <1pk/d 117 (20.4%) 57 (38.3%) 123 (21.1%) 49 (35.8%)
4 None 316 (55.1%) 43 (28.9%) 317 (54.4%) 41 (29.9%)
Missing 91 (15.9%) 26 (17.4%) 95 (16.3%) 24 (17.5%)

Note hypomethylation in exposed children at cg05575291 (AHHR) gene, which is more strongly associated at the Teen visit than the Child visit…. although not a very small p-value - check with Kelly - is this concerning?

Additionally, cell type proportion does not appear to associate with main predictor of binary prenatal smoking variable.

3.7.2 Covariates by main predictor (stratified by ancestry)

These tables use the base model group from the above CONSORT diagram (n=844).

smkA<-strataTable(createTable(compareGroups(smkPreg_binary~globalmethylation+SSnewbornCT_notransform+SSolder_notransform+cg05575921+pediatric+levine+Epi+Fib+IC+cm1inpov+cm1ethrace+cm1bsex+m1g2_YesNoPreg+m1g3_YesNoPreg+PostnatalMaternalSmokingAny+SmkAtVisitPastmonth+k5f1l+k6d40+f1g4, data=tabledata%>%filter(ancestry=='African ancestry')%>%copy_labels(tabledata)), show.all=F), 'childteen')
export2md(smkA, caption="Smoked during pregnancy vs covariates: African ancestry only")
Smoked during pregnancy vs covariates: African ancestry only

C
T
No Yes p.overall No Yes p.overall
N=356 N=96 N=363 N=86
Global methylation 0.48 (0.01) 0.48 (0.01) 0.612 0.48 (0.01) 0.48 (0.01) 0.471
PES: Sustained smoking, newborn cordblood (+CT control) 2.59 (0.23) 2.73 (0.22) <0.001 2.63 (0.22) 2.76 (0.21) <0.001
PES: Sustained smoking, older children peripheral blood 2.34 (0.25) 2.44 (0.26) 0.001 2.35 (0.29) 2.41 (0.31) 0.068
cg05575291 methylation 0.78 (0.05) 0.76 (0.06) 0.010 0.77 (0.06) 0.75 (0.07) 0.082
pediatric methylation clock 9.06 (0.80) 9.16 (1.41) 0.509 11.7 (1.74) 12.0 (1.89) 0.303
levine methylation clock 6.53 (6.07) 7.30 (6.83) 0.318 16.4 (6.16) 16.7 (6.50) 0.707
Epithelial cell proportion 0.29 (0.13) 0.28 (0.13) 0.486 0.31 (0.16) 0.32 (0.17) 0.511
Fibroid cell proportion 0.02 (0.01) 0.02 (0.01) 0.780 0.01 (0.01) 0.01 (0.01) 0.729
Immune cell proportion 0.70 (0.13) 0.71 (0.12) 0.457 0.67 (0.16) 0.66 (0.16) 0.507
cm1inpov Constructed - Poverty ratio - mother’s household income/poverty threshold 1.93 (1.92) 1.18 (1.28) <0.001 1.92 (1.91) 1.06 (1.12) <0.001
cm1ethrace Mother race (baseline own report): 0.021 0.228
1 white, non-hispanic 10 (2.81%) 8 (8.33%) 9 (2.48%) 4 (4.65%)
2 black, non-hispanic 320 (89.9%) 82 (85.4%) 329 (90.6%) 76 (88.4%)
3 hispanic 20 (5.62%) 2 (2.08%) 18 (4.96%) 2 (2.33%)
4 other 5 (1.40%) 4 (4.17%) 6 (1.65%) 4 (4.65%)
Missing 1 (0.28%) 0 (0.00%) 1 (0.28%) 0 (0.00%)
cm1bsex Constructed - Focal baby’s gender: 0.908 0.905
1 Boy 179 (50.3%) 47 (49.0%) 180 (49.6%) 42 (48.8%)
2 Girl 177 (49.7%) 49 (51.0%) 183 (50.4%) 44 (51.2%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Any alcohol during pregnancy: <0.001 <0.001
No 333 (93.5%) 64 (66.7%) 340 (93.7%) 59 (68.6%)
Yes 23 (6.46%) 32 (33.3%) 23 (6.34%) 27 (31.4%)
Any drugs during pregnancy: <0.001 <0.001
No 348 (97.8%) 77 (80.2%) 356 (98.1%) 67 (77.9%)
Yes 8 (2.25%) 19 (19.8%) 7 (1.93%) 19 (22.1%)
Any postnatal smoking (age 1 & 5): <0.001 <0.001
Maternal smoking at age 1 or age 5 93 (26.1%) 93 (96.9%) 95 (26.2%) 83 (96.5%)
No maternal smoking at age 1 and age 5 263 (73.9%) 3 (3.12%) 268 (73.8%) 3 (3.49%)
Mother/PCG (9/15 yr) smoking in month prior to interview (pk/day): <0.001 <0.001
Less than pack a day 61 (17.1%) 53 (55.2%) 59 (16.3%) 55 (64.0%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
No smoking 279 (78.4%) 12 (12.5%) 299 (82.4%) 21 (24.4%)
Pack or more a day 16 (4.49%) 31 (32.3%) 5 (1.38%) 10 (11.6%)
k5f1l F1L. Smoked a cigarette or used tobacco: . 1.000
1 yes 0 (0.00%) 0 (0.00%) 1 (0.28%) 0 (0.00%)
2 no 356 (100%) 96 (100%) 357 (98.3%) 85 (98.8%)
Missing 0 (0.00%) 0 (0.00%) 5 (1.38%) 1 (1.16%)
k6d40 ID: 0.002 .
1 Yes 6 (1.69%) 9 (9.38%) 0 (0.00%) 0 (0.00%)
2 No 346 (97.2%) 86 (89.6%) 363 (100%) 86 (100%)
Missing 4 (1.12%) 1 (1.04%) 0 (0.00%) 0 (0.00%)
f1g4 In the past 3 months, how many cigarettes did you smoke a day?: <0.001 0.001
1 2+pk/d 7 (1.97%) 0 (0.00%) 6 (1.65%) 0 (0.00%)
2 1<pk<2 27 (7.58%) 9 (9.38%) 23 (6.34%) 11 (12.8%)
3 <1pk/d 82 (23.0%) 40 (41.7%) 86 (23.7%) 32 (37.2%)
4 None 182 (51.1%) 27 (28.1%) 184 (50.7%) 24 (27.9%)
Missing 58 (16.3%) 20 (20.8%) 64 (17.6%) 19 (22.1%)
smkE<-strataTable(createTable(compareGroups(smkPreg_binary~globalmethylation+SSnewbornCT_notransform+SSolder_notransform+cg05575921+pediatric+levine+Epi+Fib+IC+cm1inpov+cm1ethrace+cm1bsex+m1g2_YesNoPreg+m1g3_YesNoPreg+PostnatalMaternalSmokingAny+SmkAtVisitPastmonth+k5f1l+k6d40+f1g4, simplify=FALSE, data=tabledata%>%filter(ancestry=='European ancestry' & SmkAtVisitPastmonth!="Missing")%>%copy_labels(tabledata)), show.all=TRUE), 'childteen')
export2md(smkE, caption="Smoked during pregnancy vs covariates: European ancestry only")
Smoked during pregnancy vs covariates: European ancestry only

C
T
[ALL] No Yes p.overall [ALL] No Yes p.overall
N=115 N=81 N=34 N=119 N=86 N=33
Global methylation 0.48 (0.01) 0.48 (0.01) 0.48 (0.01) 0.336 0.48 (0.01) 0.48 (0.01) 0.48 (0.01) 0.928
PES: Sustained smoking, newborn cordblood (+CT control) 2.61 (0.24) 2.56 (0.21) 2.73 (0.26) 0.001 2.64 (0.27) 2.61 (0.26) 2.73 (0.28) 0.037
PES: Sustained smoking, older children peripheral blood 2.39 (0.24) 2.35 (0.21) 2.51 (0.25) 0.001 2.42 (0.24) 2.39 (0.23) 2.48 (0.25) 0.060
cg05575291 methylation 0.78 (0.05) 0.78 (0.05) 0.77 (0.05) 0.453 0.78 (0.05) 0.79 (0.05) 0.77 (0.05) 0.102
pediatric methylation clock 8.85 (0.66) 8.87 (0.64) 8.82 (0.71) 0.695 11.1 (1.48) 11.1 (1.54) 11.2 (1.34) 0.793
levine methylation clock 5.56 (4.79) 5.59 (4.69) 5.48 (5.08) 0.917 14.8 (5.58) 14.4 (5.47) 15.9 (5.80) 0.203
Epithelial cell proportion 0.26 (0.12) 0.27 (0.11) 0.24 (0.12) 0.143 0.27 (0.13) 0.27 (0.13) 0.27 (0.12) 0.952
Fibroid cell proportion 0.02 (0.01) 0.01 (0.01) 0.02 (0.01) 0.309 0.01 (0.01) 0.01 (0.01) 0.01 (0.01) 0.489
Immune cell proportion 0.72 (0.11) 0.71 (0.11) 0.75 (0.11) 0.150 0.72 (0.12) 0.71 (0.13) 0.72 (0.11) 0.911
cm1inpov Constructed - Poverty ratio - mother’s household income/poverty threshold 4.48 (3.38) 5.33 (3.51) 2.45 (1.89) <0.001 4.45 (3.39) 5.23 (3.51) 2.44 (1.95) <0.001
cm1ethrace Mother race (baseline own report): 0.580 0.307
1 white, non-hispanic 111 (96.5%) 79 (97.5%) 32 (94.1%) 115 (96.6%) 84 (97.7%) 31 (93.9%)
2 black, non-hispanic 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
3 hispanic 4 (3.48%) 2 (2.47%) 2 (5.88%) 4 (3.36%) 2 (2.33%) 2 (6.06%)
4 other 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
cm1bsex Constructed - Focal baby’s gender: 0.310 0.414
1 Boy 53 (46.1%) 40 (49.4%) 13 (38.2%) 51 (42.9%) 39 (45.3%) 12 (36.4%)
2 Girl 62 (53.9%) 41 (50.6%) 21 (61.8%) 68 (57.1%) 47 (54.7%) 21 (63.6%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Any alcohol during pregnancy: 0.385 0.048
No 92 (80.0%) 67 (82.7%) 25 (73.5%) 98 (82.4%) 75 (87.2%) 23 (69.7%)
Yes 23 (20.0%) 14 (17.3%) 9 (26.5%) 21 (17.6%) 11 (12.8%) 10 (30.3%)
Any drugs during pregnancy: 0.002 <0.001
No 110 (95.7%) 81 (100%) 29 (85.3%) 112 (94.1%) 86 (100%) 26 (78.8%)
Yes 5 (4.35%) 0 (0.00%) 5 (14.7%) 7 (5.88%) 0 (0.00%) 7 (21.2%)
Any postnatal smoking (age 1 & 5): <0.001 <0.001
Maternal smoking at age 1 or age 5 54 (47.0%) 23 (28.4%) 31 (91.2%) 52 (43.7%) 22 (25.6%) 30 (90.9%)
No maternal smoking at age 1 and age 5 61 (53.0%) 58 (71.6%) 3 (8.82%) 67 (56.3%) 64 (74.4%) 3 (9.09%)
Mother/PCG (9/15 yr) smoking in month prior to interview (pk/day): <0.001 <0.001
Less than pack a day 32 (27.8%) 14 (17.3%) 18 (52.9%) 20 (16.8%) 8 (9.30%) 12 (36.4%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
No smoking 66 (57.4%) 60 (74.1%) 6 (17.6%) 85 (71.4%) 73 (84.9%) 12 (36.4%)
Pack or more a day 17 (14.8%) 7 (8.64%) 10 (29.4%) 14 (11.8%) 5 (5.81%) 9 (27.3%)
k5f1l F1L. Smoked a cigarette or used tobacco: . 0.681
1 yes 0 (0.00%) 0 (0.00%) 0 (0.00%) 1 (0.84%) 1 (1.16%) 0 (0.00%)
2 no 115 (100%) 81 (100%) 34 (100%) 115 (96.6%) 82 (95.3%) 33 (100%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 3 (2.52%) 3 (3.49%) 0 (0.00%)
k6d40 ID: 0.322 .
1 Yes 6 (5.22%) 3 (3.70%) 3 (8.82%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
2 No 107 (93.0%) 77 (95.1%) 30 (88.2%) 119 (100%) 86 (100%) 33 (100%)
Missing 2 (1.74%) 1 (1.23%) 1 (2.94%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
f1g4 In the past 3 months, how many cigarettes did you smoke a day?: <0.001 <0.001
1 2+pk/d 3 (2.61%) 1 (1.23%) 2 (5.88%) 3 (2.52%) 1 (1.16%) 2 (6.06%)
2 1<pk<2 14 (12.2%) 4 (4.94%) 10 (29.4%) 14 (11.8%) 6 (6.98%) 8 (24.2%)
3 <1pk/d 21 (18.3%) 11 (13.6%) 10 (29.4%) 23 (19.3%) 13 (15.1%) 10 (30.3%)
4 None 60 (52.2%) 53 (65.4%) 7 (20.6%) 63 (52.9%) 55 (64.0%) 8 (24.2%)
Missing 17 (14.8%) 12 (14.8%) 5 (14.7%) 16 (13.4%) 11 (12.8%) 5 (15.2%)
smkH<-strataTable(createTable(compareGroups(smkPreg_binary~globalmethylation+SSnewbornCT_notransform+SSolder_notransform+cg05575921+pediatric+levine+Epi+Fib+IC+cm1inpov+cm1ethrace+cm1bsex+m1g2_YesNoPreg+m1g3_YesNoPreg+PostnatalMaternalSmokingAny+SmkAtVisitPastmonth+k5f1l+k6d40+f1g4, data=tabledata%>%filter(ancestry=='Hispanic ancestry')%>%copy_labels(tabledata)), show.all=F), 'childteen')
export2md(smkH, caption="Smoked during pregnancy vs covariates: Hispanic ancestry only")
Smoked during pregnancy vs covariates: Hispanic ancestry only

C
T
No Yes p.overall No Yes p.overall
N=137 N=19 N=134 N=18
Global methylation 0.48 (0.01) 0.48 (0.01) 0.611 0.48 (0.01) 0.48 (0.01) 0.034
PES: Sustained smoking, newborn cordblood (+CT control) 2.56 (0.21) 2.63 (0.21) 0.203 2.60 (0.20) 2.68 (0.18) 0.085
PES: Sustained smoking, older children peripheral blood 2.37 (0.22) 2.49 (0.21) 0.029 2.34 (0.26) 2.56 (0.15) <0.001
cg05575291 methylation 0.78 (0.05) 0.79 (0.05) 0.481 0.76 (0.06) 0.80 (0.06) 0.028
pediatric methylation clock 8.85 (1.05) 8.65 (0.79) 0.335 11.5 (1.82) 10.8 (1.60) 0.086
levine methylation clock 6.71 (5.63) 6.85 (5.60) 0.923 17.7 (6.50) 15.9 (4.99) 0.165
Epithelial cell proportion 0.25 (0.11) 0.22 (0.11) 0.365 0.30 (0.15) 0.21 (0.11) 0.007
Fibroid cell proportion 0.02 (0.01) 0.01 (0.01) 0.404 0.01 (0.01) 0.01 (0.01) 0.622
Immune cell proportion 0.74 (0.11) 0.77 (0.11) 0.323 0.69 (0.14) 0.77 (0.11) 0.007
cm1inpov Constructed - Poverty ratio - mother’s household income/poverty threshold 1.93 (2.23) 1.00 (0.62) <0.001 1.95 (2.29) 0.89 (0.53) <0.001
cm1ethrace Mother race (baseline own report): 0.020 0.011
1 white, non-hispanic 9 (6.57%) 6 (31.6%) 8 (5.97%) 6 (33.3%)
2 black, non-hispanic 4 (2.92%) 0 (0.00%) 4 (2.99%) 0 (0.00%)
3 hispanic 109 (79.6%) 13 (68.4%) 107 (79.9%) 12 (66.7%)
4 other 14 (10.2%) 0 (0.00%) 14 (10.4%) 0 (0.00%)
Missing 1 (0.73%) 0 (0.00%) 1 (0.75%) 0 (0.00%)
cm1bsex Constructed - Focal baby’s gender: 0.810 0.803
1 Boy 67 (48.9%) 10 (52.6%) 61 (45.5%) 9 (50.0%)
2 Girl 70 (51.1%) 9 (47.4%) 73 (54.5%) 9 (50.0%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Any alcohol during pregnancy: 0.001 0.004
No 131 (95.6%) 13 (68.4%) 128 (95.5%) 13 (72.2%)
Yes 6 (4.38%) 6 (31.6%) 6 (4.48%) 5 (27.8%)
Any drugs during pregnancy: 0.073 0.069
No 135 (98.5%) 17 (89.5%) 132 (98.5%) 16 (88.9%)
Yes 2 (1.46%) 2 (10.5%) 2 (1.49%) 2 (11.1%)
Any postnatal smoking (age 1 & 5): <0.001 <0.001
Maternal smoking at age 1 or age 5 30 (21.9%) 18 (94.7%) 34 (25.4%) 17 (94.4%)
No maternal smoking at age 1 and age 5 107 (78.1%) 1 (5.26%) 100 (74.6%) 1 (5.56%)
Mother/PCG (9/15 yr) smoking in month prior to interview (pk/day): <0.001 0.002
Less than pack a day 16 (11.7%) 11 (57.9%) 15 (11.2%) 8 (44.4%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
No smoking 119 (86.9%) 5 (26.3%) 118 (88.1%) 10 (55.6%)
Pack or more a day 2 (1.46%) 3 (15.8%) 1 (0.75%) 0 (0.00%)
k5f1l F1L. Smoked a cigarette or used tobacco: . 1.000
1 yes 0 (0.00%) 0 (0.00%) 1 (0.75%) 0 (0.00%)
2 no 137 (100%) 19 (100%) 132 (98.5%) 18 (100%)
Missing 0 (0.00%) 0 (0.00%) 1 (0.75%) 0 (0.00%)
k6d40 ID: 0.350 .
1 Yes 8 (5.84%) 2 (10.5%) 0 (0.00%) 0 (0.00%)
2 No 129 (94.2%) 17 (89.5%) 134 (100%) 18 (100%)
Missing 0 (0.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
f1g4 In the past 3 months, how many cigarettes did you smoke a day?: 0.054 0.020
1 2+pk/d 0 (0.00%) 1 (5.26%) 0 (0.00%) 1 (5.56%)
2 1<pk<2 11 (8.03%) 1 (5.26%) 12 (8.96%) 1 (5.56%)
3 <1pk/d 24 (17.5%) 7 (36.8%) 24 (17.9%) 7 (38.9%)
4 None 81 (59.1%) 9 (47.4%) 78 (58.2%) 9 (50.0%)
Missing 21 (15.3%) 1 (5.26%) 20 (14.9%) 0 (0.00%)

When stratified by ancestry, relationship between cg05575291 methylation and smoking remains in African ancestry group, pattern of hypomethylation exists in teen measures of European ancestry group but not child measures, and actually non-significant hypermethylation in Hispanic ancestry group.

3.8 Correlation plots between all continuous variables

3.9 Associations with global methylation

At child and teen visit, variables to do with Father and Mother hispanic identity and ethnicity associated with global methylation, as did baby sex.

Some maternal/primary care giver smoking variables from year 1, 9 and 15 also associated. A note on the question PCG Year 9 Smoke and related questions n5f17 and n5f18 - these variables are weird, with lots of missing values, need to check the FF question guide, but my impression is that they were only asked to children for whom the PCG was not the mom (PCG and Core mother survey were combined in wave 6 - year 15).

3.10 Associations with cg05575921 methylation and other significant sites from Wiklund et al 2019

Base model controls for ancestry, childsex, maternal income to poverty ratio, Epi cells, IC cells, and child age in months at sample.

#define function for use in purr 
apriori_fun = function(response, dat, groupF, vars){
  exclude=ifelse(groupF=='C', 'cp6yagem', ifelse(groupF=='T', 'cm5b_age', NA))
  vars=vars[vars!=exclude]
  form=paste0(response, "~", paste(vars, collapse='+'))
  tidy(lm(as.formula(form), data=dat %>% filter(childteen==groupF)), conf.int=T, conf.level=0.95)
}

#read in top CpGs from Table One of Wicklund et al 2019
topCPGs<-read.csv(paste0(datadir, "TopCpGsWiklundEtAl2019.csv"))
topCPGs<-topCPGs %>% filter(CpG %in% aprioriCG)%>% mutate(estimate=as.numeric(gsub("−\\s", "-", gsub(" .*", "", β..SE.))), std.error=as.numeric(sub("\\).*", "", sub(".*\\(", "", β..SE.))), p.value=P.value, source="Wiklund et al. 2019", conf.low=(estimate-1.96*std.error), conf.high=(estimate+1.96*std.error))%>%select(CpG, Chr, Position, Gene, estimate, std.error, p.value, conf.low, conf.high, source)

#purr over the aprioriCpGs
vars=aprioriCG
vars=purrr::set_names(vars)
#ages=c('C', 'T')
#ages=purrr::set_names(ages)
models_C=  vars%>% purrr::map(apriori_fun, dat=basemodeldata, groupF='C', vars=basemodelvar)
models_T=  vars%>% purrr::map(apriori_fun, dat=basemodeldata, groupF='T', vars=basemodelvar)
models=list('child'=models_C, 'teens'=models_T)
#pull out list of tidy models into a datafram 
modeldf<-left_join(bind_rows(unlist(models, recursive=F), .id="id") %>% filter(term=='smkPreg_binaryYes')%>%mutate(source=paste0("Fragile Families: ", age=gsub("\\..*", "", id)), CpG=gsub(".*\\.", "", id)), topCPGs %>%select(CpG, Chr, Position, Gene))%>%select(CpG, Chr, Position, Gene, estimate, std.error, p.value, conf.low, conf.high, source)
## Joining, by = "CpG"
#combine
modeldf<-rbind(modeldf, topCPGs)%>%mutate(label=paste0(Gene, ": ", CpG))

#plot
ggplot(modeldf, aes(x=estimate, xmin=conf.low, xmax=conf.high, color=source, y=label))+geom_point(position=position_dodge(width=1))+geom_errorbarh(height=0.1, position=position_dodge(width=1))+scale_x_continuous()+scale_y_discrete(name="CpGs (a priori)")+geom_vline(xintercept=0, color='red', linetype="dashed", alpha=0.5)+ggtitle("Wiklund 2019 vs Fragile Families at 4 selected a priori CpGs: \n Base model")

4 Datasets for modeling

Variables

Base model: Binary smoking variable [X], ancestry categorization, baby sex [X], maternal income to poverty ratio [X], child’s age in month at visit of interest (note, currently using a constructed child’s age at visit, where individuals having a missing child’s age at home visit was supplemented with child’s age at home interview) [X], cell types (Epi + IC, may need to take one out),

Prenatal exposures: Base model variables + prenatal maternal alcohol use (Yes/no) [X], prenatal maternal drug use (yes/no)

Secondhand smoke exposure models: Prenatal exposures model+ any maternal smoking at age 1 & 5 [X], maternal or PCG smoking dose at visit of interest (age 9 or 15) [X]

First hand smoking: Secondhand smoke exposure models + exclude children with missing or reported firsthand smoking at visit of interest (age 9 or 15)

Paternal smoking: Filter to individuals with data on prenatal paternal smoking and compare effect size of prenatal paternal to prenatal maternal (as per Wiklund et al. 2019, idea is that this captures additional confounding)

Ancestry specific models - by ancestry groups; replace ancestry control with control for local PCs within ancestry group

Longitduinal models - no additional filtering needed

save(child_base, file=paste0(datadir, 'CreatedData/', 'ChildBaseModelData.Rdata'))
save(teen_base, file=paste0(datadir, 'CreatedData/', 'ChildBaseModelData.Rdata'))
save(basemodeldata, file=paste0(datadir, 'CreatedData/', 'BaseModelData.Rdata'))

save(child_prenatal, file=paste0(datadir, 'CreatedData/', 'ChildPrenatalExModelData.Rdata'))
save(teen_prenatal, file=paste0(datadir, 'CreatedData/', 'PrenatalExModelData.Rdata'))
save(prenatalexdata, file=paste0(datadir, 'CreatedData/', 'PrenatalExModelData.Rdata'))

save(child_secondhand, file=paste0(datadir, 'CreatedData/', 'ChildSecondHandSmkModelData.Rdata'))
save(teen_secondhand, file=paste0(datadir, 'CreatedData/', 'TeenSecondHandSmkModelData.Rdata'))
save(twovisit_secondhand, file=paste0(datadir, 'CreatedData/', 'TwoVisitSecondHandSmkModelData.Rdata'))
save(secondhandsmkdata, file=paste0(datadir, 'CreatedData/', 'SecondHandSmkModelData.Rdata'))

save(child_nosmoke, file=paste0(datadir, 'CreatedData/', 'ChildNoSmkModelData.Rdata'))
save(teen_nosmoke, file=paste0(datadir, 'CreatedData/', 'TeenNoSmkModelData.Rdata'))
save(twovisit_nosmoke, file=paste0(datadir, 'CreatedData/', 'TwoVisitNoSmkModelData.Rdata'))


save(fathersmkdata, file=paste0(datadir, 'CreatedData/', 'FatherSmkModelData.Rdata'))


save(list=c("basemodeldata", "prenatalexdata", "secondhandsmkdata", "child_secondhand", "teen_secondhand", "twovisit_secondhand", "twovisit_nosmoke", "child_nosmoke", "teen_nosmoke", "fathersmkdata"), file=paste0(datadir, 'CreatedData/', Sys.Date(), 'CreatedData.Rdata'))